## Load libraries
library(DESeq2)
library(patchwork)
library(foreach)
library(doParallel)
library(tidyverse)

## Define main path
main_path <- file.path("/home/grocamora/RytenLab-Research/18-DGE_limma_dream")# here::here()

## Load helper functions
source(file.path(main_path, "R/hf_graph_and_themes.R"))
source(file.path(main_path, "R/hf_reports.R"))
source(file.path(main_path, "R/run_stat_test_for_all_cols_get_res.R"))

## Set options
options(dplyr.summarise.inform = FALSE)
options(lifecycle_verbosity = "warning")
knitr::opts_chunk$set(echo = T, warning = F, message = F, 
                      out.width="100%", fig.align = "center", dpi = 100,
                      fig.width = 7.2, fig.height = 7.2)

1 Background

Update of Jon’s PCA and PCA corrections report. The covariates employed in this report are extracted from the script are extracted from:

/home/grocamora/RytenLab-Research/18-DGE_limma_dream/Scripts/new_covariate_correction.R

The scripts updates the previous iteration by Jon with Aine’s latest updates (as of 07/09/2023). The selected covariates are:

GC_NC_40_59 + INTRONIC_BASES + PCT_UTR_BASES + Total_Sequences + gender + tss_up_1kb_tag_pct

As mentioned, this report relies on the previous execution of the 01-Hardy_covariate_identification.R script, since it reads the output generated by it. Thus, we need to specify the path to input files, load the metadata and blacklisted genes:

# Paths
main_path <- "/home/grocamora/RytenLab-Research/18-DGE_limma_dream"
results_path <- file.path(main_path, "results/Hardy_NewCovs")

txi.salmon_path <- file.path(main_path, "data/Hardy/txi.salmon.rds")
metadata_path <- file.path(results_path, "all_brain_areas_covariates.rds")
sample_metadata_path <- file.path(main_path, "metadata/Hardy/updated_metadata_w_PMI.csv")
sample_outliers_template_path <- file.path(results_path, "sample_outliers")
bxp_outliers_path <- file.path(results_path, "bxp_outliers.csv")

blacklist_genes <- file.path(main_path, "data", "blist_genes_no_vers.csv")
gene_map_path <- file.path(main_path, "data", "gencode_txid_to_geneid.txt")

# Load data
txi.salmon <- readRDS(txi.salmon_path)
cts <- txi.salmon$counts

metadata <- loadMetadataHardy(metadata_path, sample_metadata_path)

covariates <- c("GC_NC_40_59", "gender",
                "tss_up_1kb_tag_pct", "PCT_UTR_BASES",
                "Total_Sequences", "INTRONIC_BASES")

# Load blacklisted genes
blacklist_genes <- vroom::vroom(blacklist_genes, col_names = T, delim = ",")
blacklist_genes <- as.vector(unlist(blacklist_genes))

gencode_txid_to_geneid <- vroom::vroom(gene_map_path) %>%
  `colnames<-`(c("tx_id", "gene_id","gene_name", "description")) %>%
  dplyr::mutate(tx_id = sub("\\..+", "", tx_id),
                gene_id = sub("\\..+", "", gene_id))

# QC checks
all(colnames(cts) == metadata$bxp_id_full)

Additionally, we apply an expression threshold in which a \(fpm>1\) in 50% of the samples is required for a gene to be employed in the analysis. We also remove the genes from the blacklisted regions.

dds <- DESeq2::DESeqDataSetFromTximport(txi = txi.salmon,
                                        colData = metadata,
                                        design = reformulate(c(covariates, "Group")))

# Estimate library size correction scaling factors
dds <- DESeq2::estimateSizeFactors(dds)

# Remove blacklist genes
dds <- dds[!rownames(dds) %in% blacklist_genes, ]

# Identify genes that pass expression cutoff - then filter out genes that are
# extremely lowly expressed here, the standard cut-off set is fpm>1 in 50% or
# more of samples
isexpr <- rowSums(DESeq2::fpm(dds) > 1) >= 0.5 * ncol(dds)

# Compute log2 Fragments Per Million
quantlog.counts = log2(DESeq2::fpm(dds)[isexpr, ] + 1)

1.1 Changelog

v1.5

  • Updated to R v4.3.0

v1.4

  • Fixed covariate identification mistake.

v1.3

  • Added additional results section with studies about brain region separation.

v1.2

  • Added two different selection of outliers: 18 outliers vs. 7 outliers.

  • Added more graphs to assists with the decision of which outliers to remove.

v1.1

  • Added WGCNA approach to the outlier removal methods.

  • Updated to use a total of 16 outliers.

v1.0

  • Initial report.

2 PC Analysis by covariates

2.1 Pre-correction

First, we study if the covariates we selected group the samples in the PC plots. We select PCs until 85% of the variance is explained:

pca <- prcomp(t(quantlog.counts))

highest_pc <- which(summary(pca)$importance[3, ] > 0.85)[1]

pca_out <- pca$x %>%
  .[, seq(highest_pc)] %>%
  tibble::as_tibble(rownames = "bxp_id_full")

var_explained = pca$sdev^2 %>% 
  tibble::tibble(PC = factor(1:length(.)), 
                 variance = .) %>% 
  dplyr::mutate(pct = variance/sum(variance)*100,
                pct_cum = cumsum(pct))

var_explained_plot = var_explained %>% 
  dplyr::filter(PC %in% c(1:(highest_pc*2))) %>% 
  ggplot(aes(x = PC)) +
  geom_col(aes(y = pct), color = "black", fill = ggsci::pal_jco()(3)[3]) +
  geom_line(aes(y = pct_cum, group = 1)) +
  geom_point(aes(y = pct_cum)) +
  geom_text(aes(label = round(pct, 2), y = pct + 3)) +
  scale_y_continuous(expand = expansion(), limits = c(0, 100), 
                     breaks = c(seq(0, 100, 25), 85)) +
  geom_hline(aes(yintercept = 85, linetype = "PC_limit"), linetype = "dashed",
             show.legend = F) +
  #scale_linetype_manual(name = "PC Threshold", values = "dashed", label = "85%") +
  labs(x = "Principal component", y = "% Variance explained") +
  custom_gg_theme + theme(legend.position = "right")

var_explained_plot

As shown in the figure, we only need 5 PCs to explain up to 85% of the variance.

pca_out %>%
  dplyr::left_join(metadata %>%
                     dplyr::select(bxp_id_full, Group),
                   by = "bxp_id_full") %>%
  kruskal.test(PC1 ~ Group, data = .)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  PC1 by Group
## Kruskal-Wallis chi-squared = 9.8861, df = 4, p-value = 0.04239
r <- sapply(c(covariates, "pd"), function(covariate){
  cat("### ", covariate, " {-}\n", sep = "")
  
  plot(plotPCgroup(pca_out, metadata, covariate))
  cat("\n\n")
})

GC_NC_40_59

gender

tss_up_1kb_tag_pct

PCT_UTR_BASES

Total_Sequences

INTRONIC_BASES

pd

2.2 Post-correction

We employ limma::removeBatchEffect to covariate correct the salmon quantifications. In this example, our treatment matrix will consist of the Group variable.

covs_df <- metadata %>%
  dplyr::select(bxp_id_full, all_of(covariates), Group) %>%
  dplyr::mutate(across(where(is.numeric), ~ scale(.))) %>%
  tibble::column_to_rownames("bxp_id_full")

design <- model.matrix(reformulate(c(covariates, "Group")), data = covs_df)
design_treatment <- design[, grepl("(^Group)|(^\\(Intercept\\)$)", colnames(design))]
design_batch <- design[, !colnames(design) %in% colnames(design_treatment)]
design_batch_numeric <- design[, covs_df %>% dplyr::select(where(is.numeric)) %>% colnames()]
design_batch_categorical <- design[, !colnames(design) %in% c(colnames(design_treatment),
                                                              colnames(design_batch_numeric))]
# Test that rownames in design df == colnames in feature matrix
unique((quantlog.counts %>% colnames()) == (design %>% rownames())) 

# Apply removeBatchEffect to correct out the effect of the covariates
quantlog.counts_covar = limma::removeBatchEffect( # based on lmFit
  quantlog.counts,
  covariates = design_batch, # continuous/ numerical covariates here
  design = design_treatment)

After the covariate correction, we process with the same PC analysis as before.

pca_adj <- prcomp(t(quantlog.counts_covar))
# pca2 <- prcomp(t(quantlog.counts), center = TRUE, scale. = FALSE)

highest_pc_adj <- which(summary(pca_adj)$importance[3, ] > 0.85)[1]

pca_adj_out <- pca_adj$x %>%
  .[, seq(highest_pc_adj)] %>%
  tibble::as_tibble(rownames = "bxp_id_full")

var_explained_adj = pca_adj$sdev^2 %>% 
  tibble::tibble(PC = factor(1:length(.)), 
                 variance = .) %>% 
  dplyr::mutate(pct = variance/sum(variance)*100,
                pct_cum = cumsum(pct))

var_explained_adj_plot = var_explained_adj %>% 
  dplyr::filter(PC %in% c(1:(highest_pc_adj*1.1))) %>% 
  ggplot(aes(x = PC)) +
  geom_col(aes(y = pct), color = "black", fill = ggsci::pal_jco()(3)[3]) +
  geom_line(aes(y = pct_cum, group = 1)) +
  geom_point(aes(y = pct_cum)) +
  geom_text(aes(label = round(pct, 1), y = pct + 3), size = 3) +
  scale_y_continuous(expand = expansion(), limits = c(0, 100), 
                     breaks = c(seq(0, 100, 25), 85)) +
  geom_hline(aes(yintercept = 85, linetype = "PC_limit"), linetype = "dashed",
             show.legend = F) +
  #scale_linetype_manual(name = "PC Threshold", values = "dashed", label = "85%") +
  labs(x = "Principal component", y = "% Variance explained") +
  custom_gg_theme + theme(legend.position = "right")

var_explained_adj_plot

This time, we need 29 PCs to reach 85% of variance explained. Only the first 9 PCs will be shown in the following plots.

pca_adj_out %>%
  dplyr::left_join(metadata %>%
                     dplyr::select(bxp_id_full, Group),
                   by = "bxp_id_full") %>%
  kruskal.test(PC1 ~ Group, data = .)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  PC1 by Group
## Kruskal-Wallis chi-squared = 5.1531, df = 4, p-value = 0.2719
r <- sapply(c(covariates, "pd"), function(covariate){
  cat("### ", covariate, " {-}\n", sep = "")
  
  plot(plotPCgroup(pca_adj_out %>% dplyr::select(1:11), metadata, covariate))
  cat("\n\n")
})

GC_NC_40_59

gender

tss_up_1kb_tag_pct

PCT_UTR_BASES

Total_Sequences

INTRONIC_BASES

pd

2.3 Brain region separation with control samples

An additional study is if the covariate correction helped to identify different brain regions in the control samples.

2.3.1 All brain regions

pca_out_control <- pca_out %>%
  dplyr::left_join(metadata %>% dplyr::select(bxp_id_full, pd, brain_region), by = "bxp_id_full") %>%
  dplyr::filter(pd == "control") %>%
  dplyr::select(-pd, -brain_region)
pca_adj_out_control <- pca_adj_out %>%
  dplyr::left_join(metadata %>% dplyr::select(bxp_id_full, pd, brain_region), by = "bxp_id_full") %>%
  dplyr::filter(pd == "control") %>%
  dplyr::select(-pd, -brain_region)

Results show that no separation power is visually observed before and after the correction.

Pre-correction

Post-correction

2.3.2 IPL vs ACG

pca_out_control_filtered <- pca_out %>%
  dplyr::left_join(metadata %>% dplyr::select(bxp_id_full, pd, brain_region), by = "bxp_id_full") %>%
  dplyr::filter(pd == "control", brain_region %in% c("IPL", "ACG")) %>%
  dplyr::select(-pd, -brain_region)
pca_adj_out_control_filtered <- pca_adj_out %>%
  dplyr::left_join(metadata %>% dplyr::select(bxp_id_full, pd, brain_region), by = "bxp_id_full") %>%
  dplyr::filter(pd == "control", brain_region %in% c("IPL", "ACG")) %>%
  dplyr::select(-pd, -brain_region)

Results show that no separation power is visually observed before and after the correction.

Pre-correction

Post-correction

3 Outlier removal

3.1 PCA with Z-score

getOutliersPC <- function(pca, var_explained = 85, max_pc = NULL, set_pc = NULL, z_score_threshold = 3){
  highest_pc <- which(summary(pca)$importance[3, ] > var_explained/100)[1]
  
  if(!is.null(set_pc)){
    max_outlier_pc <- set_pc
  }else if(!is.null(max_pc)){
    max_outlier_pc <- min(max_pc, highest_pc)
  }else{
    max_outlier_pc <- highest_pc
  }
  
  pc_outliers <- pca$x %>%
    .[, seq(max_outlier_pc)] %>%
    tibble::as_tibble(rownames = "bxp_id_full") %>%
    dplyr::left_join(metadata, by = "bxp_id_full") %>%
    dplyr::group_by(brain_region) %>%
    dplyr::mutate(across(paste0("PC", seq(max_outlier_pc)), list(zscore = ~ abs((. - mean(.))/sd(.))))) %>%
    dplyr::mutate(pc_number = max_outlier_pc,
                  z_score_pca_outlier = if_any(ends_with("zscore"), ~ . > z_score_threshold)) %>%
    dplyr::ungroup()
}

The PCA outlier detection approach extract the first n PCs and measures the \(z-score\) of the samples for each PC. Samples are grouped by brain region. A sample is selected as an outlier if any of the their PCs’ \(z-score\) is greater than the threshold set at \(>3\). Thus, the number of PCs employed to detect outliers is the main parameter to control for this approach.

For this study, we select PCs as a function of the total variance we want to explain. In the following plot, the X-axis represent the percentage of variance explained by the selected PCs, the size of the dots and the number inside them represent the number of PCs needed while the Y-axis represent the total number of outliers.

pc_outlier_summ <- parallel::mclapply(seq(35, 90, 5), mc.cores = (10), FUN = function(var_explained){
  pc_outliers <- getOutliersPC(pca_adj, var_explained)
  
  tibble::tibble(variance_explained = var_explained,
                 outliers_n = sum(pc_outliers$z_score_pca_outlier),
                 pc_number = unique(pc_outliers$pc_number))
}) %>% dplyr::bind_rows()

pc_outlier_summ %>%
  ggplot(aes(x = variance_explained, y = outliers_n)) + 
  geom_point(aes(size = pc_number)) +
  geom_line() +
  geom_vline(xintercept = c(65, 75), linetype = 2) +
  geom_text(aes(label = pc_number), vjust = 0.5, color = "white", fontface  = "bold", size = 3, show.legend = T) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15)), breaks = seq(0, 50, 5), limits = c(0, NA)) +
  scale_x_continuous(expand = expansion(mult = c(0.1, 0.1)), breaks = seq(30, 90, 10)) +
  scale_size_continuous(range = c(4, 12), name = "Number of PCs") +
  labs(x = "% of variance explained", y = "Number of outliers") + 
  ggtitle("Number of outliers", subtitle = "By variance explained and number of PCs") + 
  custom_gg_theme +
  theme(legend.position = "right",
        legend.key.size = unit(0, 'cm'))

pc_outliers_6 <- getOutliersPC(pca_adj, set_pc = 6)
pc_outliers_10 <- getOutliersPC(pca_adj, set_pc = 10)

From the previous representation, it seems that two points stand out as relevant: i) 6 PCs with 4 outliers and ii) 10 PCs with 10 outliers. They precede changes in the trend.

We can further study the \(z-score\) distributions for these two different number of PCs:

Plot for 6 PCs

pc_outlier_plot <- pc_outlier_plot_df %>%
  dplyr::filter(PCs == "max_z6") %>%
  dplyr::mutate(outlier_type = ifelse(zscore > 3, "1", "0")) %>%
  ggplot(aes(x = bxp_id_full, y = zscore)) +
  geom_point(aes(color = outlier_type), show.legend = F, position = "jitter") +
  geom_hline(yintercept = c(3), linetype = 2) +
  scale_color_manual(values = c("black", "#6aa1fd")) +
  scale_y_continuous(limits = range(pc_outlier_plot_df$zscore)) +
  labs(x = "Samples", y = " Max. PCA Z-score") + 
  custom_gg_theme +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major.x = element_blank())

ggExtra::ggMarginal(pc_outlier_plot, type = "histogram") 

Plot for 10 PCs

pc_outlier_plot <- pc_outlier_plot_df %>%
  dplyr::filter(PCs == "max_z10") %>%
  dplyr::mutate(outlier_type = ifelse(zscore > 3, "1", "0")) %>%
  ggplot(aes(x = bxp_id_full, y = zscore)) +
  geom_point(aes(color = outlier_type), show.legend = F, position = "jitter") +
  geom_hline(yintercept = c(3), linetype = 2) +
  scale_color_manual(values = c("black", "#6aa1fd")) +
  scale_y_continuous(limits = range(pc_outlier_plot_df$zscore)) +
  labs(x = "Samples", y = "Max. PCA Z-score") + 
  custom_gg_theme +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major.x = element_blank())

ggExtra::ggMarginal(pc_outlier_plot, type = "histogram") 

3.2 WGCNA sample connectivity and Z-score

The WGCNA sample connectivity outlier approach also splits the samples by brain region and obtain the outliers from there. We need to decide on the \(z-score\) threshold:

getOutliersWGCNA <- function(quantlog, z_score_threshold = 2){
  mat <- quantlog
  
  # Calculate biweight midcorrelation matrix between samples
  bicor.mat <- WGCNA::bicor(mat)
  
  # Calculate signed adjacency matrix between samples
  normadj.mat <- (0.5 + 0.5*bicor.mat)^2
  
  # Calculate connectivity
  connectivity.mat <- WGCNA::fundamentalNetworkConcepts(normadj.mat)
  ku <- connectivity.mat$Connectivity
  z.ku <- (ku - mean(ku))/sqrt(var(ku))
  
  # Extract results
  sample_connectivity_res <- z.ku %>%
    stack() %>%
    dplyr::rename(bxp_id_full = ind, 
                  wgcna_connectivity_z_score = values) %>% 
    dplyr::mutate(z_score_wgcna_outlier = abs(wgcna_connectivity_z_score) > z_score_threshold) %>%
    dplyr::relocate(bxp_id_full, wgcna_connectivity_z_score)
  
  return(sample_connectivity_res)
}

getOutliersRegionWGCNA <- function(quantlog, metadata, z_score_threshold = 2){
  all_tissues <- metadata$brain_region %>% unique()
  
  sample_connectivity_res <- metadata$brain_region %>%
    unique() %>%
    parallel::mclapply(., mc.cores = length(.), FUN = function(tissue){
    bxp_ids <- metadata %>% dplyr::filter(brain_region == tissue) %>% dplyr::pull(bxp_id_full)
    
    mat <- quantlog[, bxp_ids]
    bicor.mat <- WGCNA::bicor(mat)
    
    # Calculate signed adjacency matrix between samples
    normadj.mat <- (0.5 + 0.5*bicor.mat)^2
    
    # Calculate connectivity
    connectivity.mat <- WGCNA::fundamentalNetworkConcepts(normadj.mat)
    ku <- connectivity.mat$Connectivity
    z.ku <- (ku - mean(ku))/sqrt(var(ku))
    
    # Extract results
    sample_connectivity_res <- z.ku %>%
      stack() %>%
      dplyr::rename(bxp_id_full = ind, 
                    wgcna_connectivity_z_score = values) %>% 
      dplyr::mutate(z_score_wgcna_outlier = abs(wgcna_connectivity_z_score) > z_score_threshold) %>%
      dplyr::relocate(bxp_id_full, wgcna_connectivity_z_score)
  }) %>% dplyr::bind_rows()

  return(sample_connectivity_res)
} 

wgcna_outliers_2 <- getOutliersRegionWGCNA(quantlog.counts_covar, metadata, z_score_threshold = 2)
wgcna_outliers_3 <- getOutliersRegionWGCNA(quantlog.counts_covar, metadata, z_score_threshold = 3)
wgcna_plot <- wgcna_outliers_3 %>%
  dplyr::mutate(outlier_type = case_when(abs(wgcna_connectivity_z_score) > 3 ~ "2",
                                         abs(wgcna_connectivity_z_score) > 2 ~ "1",
                                         T ~ "0")) %>%
  ggplot(aes(x = bxp_id_full, y = wgcna_connectivity_z_score)) + 
  geom_point(aes(color = outlier_type), show.legend = F) +
  scale_color_manual(values = c("black", "#66d487", "#6aa1fd")) +
  scale_y_continuous(breaks = seq(3, -6, -1)) +
  geom_hline(yintercept = c(-2, -3), linetype = 2) +
  labs(x = "Samples", y = "WGCNA Z-score") + 
  # ggtitle("Z-score of WGCNA connectivity approach") + 
  custom_gg_theme +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major.x = element_blank())

ggExtra::ggMarginal(wgcna_plot, type = "histogram") 

We observe that with \(z-score > 2\) we obtain 16 outliers, while with \(z-score > 3\) we obtain 9 outliers.

3.3 Combined approach

If we combine the WGCNA outliers with both the 6 PCs and 10 PCs outliers, we observe the following Venn diagrams:

wgcna_bxp_2 <- wgcna_outliers_2$bxp_id_full[wgcna_outliers_2$z_score_wgcna_outlier] %>% as.character()
wgcna_bxp_3 <- wgcna_outliers_3$bxp_id_full[wgcna_outliers_3$z_score_wgcna_outlier] %>% as.character()
pca_bxp_6 <- pc_outliers_6$bxp_id_full[pc_outliers_6$z_score_pca_outlier]
pca_bxp_10 <- pc_outliers_10$bxp_id_full[pc_outliers_10$z_score_pca_outlier]

outlier_list <- list("WGCNA_2" = wgcna_bxp_2, 
                     "WGCNA_3" = wgcna_bxp_3, 
                     "6 PCs" = pca_bxp_6, 
                     "10 PCs" = pca_bxp_10)

patchwork::wrap_plots(ggVennDiagram::ggVennDiagram(outlier_list[c("WGCNA_3", "6 PCs")]) + 
                        ggtitle("WGCNA with z-score > 3 & 8 PCs") +
                        theme(plot.margin = margin(0, 0, 0.5, 0, "cm")),
                      ggVennDiagram::ggVennDiagram(outlier_list[c("WGCNA_3", "10 PCs")]) + 
                        ggtitle("WGCNA with z-score > 3 & 10 PCs") +
                        theme(plot.margin = margin(0, 0, 0, 0, "cm")),
                      ggVennDiagram::ggVennDiagram(outlier_list[c("WGCNA_2", "6 PCs")]) + 
                        ggtitle("WGCNA with z-score > 2 & 8 PCs") +
                        theme(plot.margin = margin(0, 0, 0, 0, "cm")),
                      ggVennDiagram::ggVennDiagram(outlier_list[c("WGCNA_2", "10 PCs")]) + 
                        ggtitle("WGCNA with z-score > 2 & 10 PCs") +
                        theme(plot.margin = margin(0, 0, 0, 0, "cm")),
                      guides = "collect") & 
  scale_fill_gradient(limits=c(0, 16)) &
  theme(legend.position = "bottom")

Top left is the more lenient configuration while bottom right is the more stringent. A good middle approach good be to select bottom left, which includes 6 PCs and set the WGCNA z-score threshold to > 2.

An additional representation that may help decide on an outlier selection approach is to represent the \(z-scores\) of all the samples that could be selected as outliers. The green bar represent the \(z-score\) for the WGCNA approach, while the dots represent the \(z-score\) for the different PCA methods. The PCA approach always require a \(z-score>3\) to be considered a outlier, while the WGCNA may require \(>2\) or \(>3\):

outlier_df <- tibble::tibble(bxp_id_full = outlier_list %>% unlist() %>% unique) %>%
  dplyr::left_join(wgcna_outliers_3 %>% 
                     dplyr::select(bxp_id_full, wgcna = wgcna_connectivity_z_score) %>%
                     dplyr::mutate(wgcna = abs(wgcna)),
                   by = "bxp_id_full") %>%
  dplyr::left_join(pc_outliers_6 %>% 
                     dplyr::rowwise() %>%
                     dplyr::mutate(pc_6_z_score = max(c_across(ends_with("zscore")))) %>%
                     dplyr::select(bxp_id_full, pc6 = pc_6_z_score),
                   by = "bxp_id_full") %>%
  dplyr::left_join(pc_outliers_10 %>% 
                     dplyr::rowwise() %>%
                     dplyr::mutate(pc_10_z_score = max(c_across(ends_with("zscore")))) %>%
                     dplyr::select(bxp_id_full, pc10 = pc_10_z_score),
                   by = "bxp_id_full") %>%
  dplyr::arrange(wgcna) %>%
  dplyr::mutate(bxp_id_full = factor(bxp_id_full, levels = unique(.$bxp_id_full))) %>%
  tidyr::pivot_longer(cols = -bxp_id_full, names_to = "method", values_to = "zscore") %>%
  dplyr::mutate(method = factor(method, levels = c("wgcna", "pc6", "pc10")))

outlier_df %>%
  {
    ggplot(., aes(x = bxp_id_full)) +
      # geom_tile(aes(y = 3, fill = ), stat = "identity", fill = "red", size = .5, width = 1, height = 7) +
      geom_col(aes(y = zscore, fill = method),
               data = . %>% dplyr::filter(method == "wgcna"),
               color = "black", width = 0.8, alpha = 1) +
      scale_fill_manual(values = "#83dc9e", label = "WGCNA", name = "Method") +
      ggnewscale::new_scale_fill() + 
      geom_point(aes(y = zscore, fill = method, shape = method, size = method), 
                 data = . %>% dplyr::filter(method != "wgcna")) +
      scale_size_manual(values = c(3, 2.15), label = c("pc6" = "6 PCs", "pc10" = "10 PCs"), name = "") +
      scale_shape_manual(values = c(21, 24), label = c("pc6" = "6 PCs", "pc10" = "10 PCs"), name = "") +
      scale_fill_manual(values = c("pc6" = "black", "pc10" = "#6aa1fd"), 
                        label = c("pc6" = "6 PCs", "pc10" = "10 PCs"), name = "") + 
      scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
      geom_hline(yintercept = c(2, 3), linetype = 2) +
      labs(x = "Samples", y = "Z-score") + 
      ggtitle("Z-Score for different methods") +
      coord_cartesian(ylim = c(0, 6)) +
      custom_gg_theme_clip +
      theme(axis.text.x = element_text(angle = 60, hjust = 1),
            legend.key = element_rect(fill = "white"))
  }

Based on these results, we decided to test two approaches:

16 outliers (6PCs all samples)

All samples selected as outlier by any of the two methods when the \(z-score\) threshold for WGCNA is \(>2\) and up to 6 PCs are employed - all samples from the bottom left Venn diagram.

Expand to see selected samples.


The outliers selected are the following:

bxp_outliers_6 <- unique(c(wgcna_bxp_2, pca_bxp_6))

sample_outliers_6 <- metadata %>%
  dplyr::left_join(wgcna_outliers_2 %>%
                     dplyr::select(bxp_id_full, 
                                   wgcna_zscore = wgcna_connectivity_z_score, 
                                   wgcna_outlier = z_score_wgcna_outlier), 
                   by = "bxp_id_full") %>%
  dplyr::left_join(pc_outliers_6 %>%
                     dplyr::rowwise() %>%
                     dplyr::mutate(max_z = max(c_across(ends_with("zscore")))) %>%
                     dplyr::select(bxp_id_full, 
                                   pca_zscore = max_z, 
                                   pca_outlier = z_score_pca_outlier), 
                   by = "bxp_id_full") %>%
  dplyr::mutate(outlier = ifelse(wgcna_outlier | pca_outlier, T, F))

sample_outliers_6 %>%
  dplyr::filter(bxp_id_full %in% bxp_outliers_6) %>%
  dplyr::select(bxp_id_full, case_id, brain_region, Group, 
                # wgcna_zscore, pca_zscore, 
                wgcna_outlier, pca_outlier) %>%
  dplyr::arrange(case_id, brain_region, Group) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(., 1))) %>%
  `colnames<-`(c("BXP ID", "Individual ID", "Brain region", "Group", 
                 # "WGCNA Zscore", "PCA Zscore", 
                 "WGCNA Outlier", "PCA outlier")) %>%
  printPrettyDf(style = "md")
BXP ID Individual ID Brain region Group WGCNA Outlier PCA outlier
BX0252_343 P15/10 IPL control TRUE FALSE
BX0252_341 P15/10 MFG control TRUE FALSE
BX0252_342 P15/10 MTG control TRUE FALSE
BX0252_360 P23/12 ACG long, no dementia TRUE FALSE
BX0252_031 P28/15 IPL long, with dementia TRUE FALSE
BX0252_030 P28/15 MTG long, with dementia TRUE FALSE
BX0252_036 P40/00 ACG short, no dementia TRUE FALSE
BX0252_033 P40/00 MFG short, no dementia TRUE TRUE
BX0252_348 P43/13 ACG control TRUE FALSE
BX0252_347 P43/13 IPL control TRUE FALSE
BX0252_346 P43/13 MTG control TRUE FALSE
BX0252_081 P45/09 MFG short, no dementia TRUE TRUE
BX0252_082 P45/09 MTG short, no dementia TRUE TRUE
BX0252_177 P47/14 MFG control TRUE FALSE
BX0252_126 P49/16 MTG long, no dementia TRUE FALSE
BX0252_171 P67/10 IPL control TRUE TRUE
number_outliers <- sum(sample_outliers_6$outlier == T)
sample_outliers_6 %>% saveRDS(paste0(sample_outliers_template_path, number_outliers, ".rds"))

After removing the outliers, the number of samples per group and brain region are:

sample_outliers_6 %>% 
  dplyr::filter(!outlier) %>% 
  dplyr::select(brain_region, group) %>% 
  table() %>%
  `colnames<-`(c("Control", 
                 "Long without dementia", "Long with dementia", 
                 "Short without dementia", "Short with dementia")) %>%
  printPrettyDf(style = "md")
Control Long without dementia Long with dementia Short without dementia Short with dementia
ACG 21 11 12 12 13
IPL 19 12 11 13 13
MFG 20 12 12 11 13
MTG 20 11 11 12 13

The PCA output with the outliers drawn:

outlier_plots <- lapply(split(paste0("PC", seq(1, 8)), ceiling((1:8)/2)), function(a){
  plotZScorePCA(pc_outliers_10 %>% dplyr::mutate(z_score_outlier = bxp_id_full %in% bxp_outliers_6), a[1], a[2])
})

(outlier_plots[[1]] + outlier_plots[[2]])/
  (outlier_plots[[3]] + outlier_plots[[4]] ) +
  plot_layout(guides = "collect") &
  theme(legend.position = "top")

21 outliers (10 PCs intersection)

All samples selected as outlier by any of the two methods when the \(z-score\) threshold for WGCNA is \(>2\) and up to 10 PCs are employed - all samples from the bottom left Venn diagram.

Expand see selected samples.


The outliers selected are the following (in yellow the new ones with respect to 16 outliers):

bxp_outliers_10 <- unique(c(wgcna_bxp_2, pca_bxp_10))

sample_outliers_10 <- metadata %>%
  dplyr::left_join(wgcna_outliers_2 %>%
                     dplyr::select(bxp_id_full, 
                                   wgcna_zscore = wgcna_connectivity_z_score, 
                                   wgcna_outlier = z_score_wgcna_outlier), 
                   by = "bxp_id_full") %>%
  dplyr::left_join(pc_outliers_10 %>%
                     dplyr::rowwise() %>%
                     dplyr::mutate(max_z = max(c_across(ends_with("zscore")))) %>%
                     dplyr::select(bxp_id_full, 
                                   pca_zscore = max_z, 
                                   pca_outlier = z_score_pca_outlier), 
                   by = "bxp_id_full") %>%
  dplyr::mutate(outlier = ifelse(wgcna_outlier | pca_outlier, T, F))

sample_outlier_table <- sample_outliers_10 %>%
  dplyr::filter(bxp_id_full %in% bxp_outliers_10) %>%
  dplyr::select(bxp_id_full, case_id, brain_region, Group, 
                # wgcna_zscore, pca_zscore, 
                wgcna_outlier, pca_outlier) %>%
  dplyr::arrange(case_id, brain_region, Group) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(., 1)))

sample_outlier_table  %>%
  `colnames<-`(c("BXP ID", "Individual ID", "Brain region", "Group", 
                 # "WGCNA Zscore", "PCA Zscore", 
                 "WGCNA Outlier", "PCA outlier")) %>%
  printPrettyDf(style = "md") %>% 
  kableExtra::row_spec(which(!(sample_outlier_table$bxp_id_full %in% bxp_outliers_6)), bold = T, 
                       color = "black", background = "#ffff99",)
BXP ID Individual ID Brain region Group WGCNA Outlier PCA outlier
BX0252_343 P15/10 IPL control TRUE FALSE
BX0252_341 P15/10 MFG control TRUE FALSE
BX0252_342 P15/10 MTG control TRUE FALSE
BX0252_113 P16/19 MFG control FALSE TRUE
BX0252_360 P23/12 ACG long, no dementia TRUE TRUE
BX0252_031 P28/15 IPL long, with dementia TRUE FALSE
BX0252_030 P28/15 MTG long, with dementia TRUE FALSE
BX0252_036 P40/00 ACG short, no dementia TRUE FALSE
BX0252_033 P40/00 MFG short, no dementia TRUE TRUE
BX0252_348 P43/13 ACG control TRUE FALSE
BX0252_347 P43/13 IPL control TRUE FALSE
BX0252_346 P43/13 MTG control TRUE FALSE
BX0252_081 P45/09 MFG short, no dementia TRUE TRUE
BX0252_082 P45/09 MTG short, no dementia TRUE TRUE
BX0252_174 P47/11 MTG control FALSE TRUE
BX0252_177 P47/14 MFG control TRUE FALSE
BX0252_126 P49/16 MTG long, no dementia TRUE FALSE
BX0252_171 P67/10 IPL control TRUE TRUE
BX0252_023 P78/11 IPL long, no dementia FALSE TRUE
BX0252_154 P84/08 MTG long, no dementia FALSE TRUE
BX0252_182 P86/08 MTG short, no dementia FALSE TRUE
number_outliers <- sum(sample_outliers_10$outlier == T)
sample_outliers_10 %>% saveRDS(paste0(sample_outliers_template_path, number_outliers, ".rds"))
a %>% dplyr::select(bxp_id_full, case_id, brain_region, Group, outlier_a = outlier) %>% 
  dplyr::left_join(b %>% dplyr::select(bxp_id_full, case_id, brain_region, Group, outlier_b = outlier)) %>% 
  dplyr::filter(outlier_a | outlier_b)
a_table <- a %>%
  dplyr::filter(outlier) %>% 
  dplyr::select(bxp_id_full, case_id, brain_region, Group, 
                wgcna_outlier, pca_outlier) %>%
  dplyr::arrange(case_id, brain_region, Group) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(., 1)))

b_table <- b %>%
  dplyr::filter(outlier) %>% 
  dplyr::select(bxp_id_full, case_id, brain_region, Group, 
                wgcna_outlier, pca_outlier) %>%
  dplyr::arrange(case_id, brain_region, Group) %>%
  dplyr::mutate(across(where(is.numeric), ~ round(., 1)))

After removing the outliers, the number of samples per group and brain region are:

sample_outliers_10 %>% 
  dplyr::filter(!outlier) %>%
  dplyr::select(brain_region, group) %>% 
  table() %>%
  `colnames<-`(c("Control", 
                 "Long without dementia", "Long with dementia", 
                 "Short without dementia", "Short with dementia")) %>%
  printPrettyDf(style = "md")
Control Long without dementia Long with dementia Short without dementia Short with dementia
ACG 21 11 12 12 13
IPL 19 11 11 13 13
MFG 19 12 12 11 13
MTG 19 10 11 11 13
outlier_plots <- lapply(split(paste0("PC", seq(1, 10)), ceiling((1:10)/2)), function(a){
  plotZScorePCA(pc_outliers_10 %>% dplyr::mutate(z_score_outlier = bxp_id_full %in% bxp_outliers_10), a[1], a[2])
})

patchwork::wrap_plots(outlier_plots, guides = "collect") &
  theme(legend.position = "top")

4 PCs without 16 outliers

bxp_outliers <- unique(c(wgcna_bxp_2, pca_bxp_6))
data.frame(bxp_outliers) %>% readr::write_delim(bxp_outliers_path, delim = ",")

covs_clean_df <- metadata %>%
  dplyr::select(bxp_id_full, all_of(covariates), Group) %>%
  dplyr::mutate(across(where(is.numeric), ~ scale(.))) %>%
  dplyr::filter(!bxp_id_full %in% bxp_outliers) %>%
  tibble::column_to_rownames("bxp_id_full")

quantlog.counts_clean <- quantlog.counts[, !colnames(quantlog.counts) %in% bxp_outliers]

design <- model.matrix(reformulate(c(covariates, "Group")), data = covs_clean_df)
design_treatment <- design[, grepl("(^Group)|(^\\(Intercept\\)$)", colnames(design))]
design_batch <- design[, !colnames(design) %in% colnames(design_treatment)]
design_batch_numeric <- design[, covs_df %>% dplyr::select(where(is.numeric)) %>% colnames()]
design_batch_categorical <- design[, !colnames(design) %in% c(colnames(design_treatment),
                                                              colnames(design_batch_numeric))]
# Test that rownames in design df == colnames in feature matrix
unique((quantlog.counts_clean %>% colnames()) == (design %>% rownames())) 

# Apply removeBatchEffect to correct out the effect of the covariates
quantlog.counts_clean_covar = limma::removeBatchEffect( # based on lmFit
  quantlog.counts_clean,
  covariates = design_batch, # continuous/ numerical covariates here
  design = design_treatment)

In the end, we decided to go for the 16 outlier option (65% of variance explained). After the covariate correction and the outlier removal, the percentage of variance explained is the following:

pca_adj <- prcomp(t(quantlog.counts_clean_covar))

highest_pc_adj <- which(summary(pca_adj)$importance[3, ] > 0.85)[1]

pca_adj_out <- pca_adj$x %>%
  .[, seq(40)] %>%
  tibble::as_tibble(rownames = "bxp_id_full")

var_explained_adj = pca_adj$sdev^2 %>% 
  tibble::tibble(PC = factor(1:length(.)), 
                 variance = .) %>% 
  dplyr::mutate(pct = variance/sum(variance)*100,
                pct_cum = cumsum(pct))

var_explained_adj_plot = var_explained_adj %>% 
  dplyr::filter(PC %in% c(1:(15*1.1))) %>% 
  ggplot(aes(x = PC)) +
  geom_col(aes(y = pct), color = "black", fill = ggsci::pal_jco()(3)[3]) +
  geom_line(aes(y = pct_cum, group = 1)) +
  geom_point(aes(y = pct_cum)) +
  geom_text(aes(label = round(pct, 1), y = pct + 3), size = 3) +
  scale_y_continuous(expand = expansion(), limits = c(0, 100), 
                     breaks = c(seq(0, 100, 25))) +
  # geom_hline(aes(yintercept = 85, linetype = "PC_limit"), linetype = "dashed", show.legend = F) +
  #scale_linetype_manual(name = "PC Threshold", values = "dashed", label = "85%") +
  labs(x = "Principal component", y = "% Variance explained") +
  custom_gg_theme + theme(legend.position = "right")

var_explained_adj_plot

pca_adj_out %>%
  dplyr::left_join(metadata %>%
                     dplyr::select(bxp_id_full, Group),
                   by = "bxp_id_full") %>%
  kruskal.test(PC1 ~ Group, data = .)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  PC1 by Group
## Kruskal-Wallis chi-squared = 2.7983, df = 4, p-value = 0.5921
r <- sapply(c(covariates, "Group"), function(covariate){
  cat("## ", covariate, " {-}\n", sep = "")
  
  plot(plotPCgroup(pca_adj_out %>% dplyr::select(1:11), metadata, covariate))
  cat("\n\n")
})

GC_NC_40_59

gender

tss_up_1kb_tag_pct

PCT_UTR_BASES

Total_Sequences

INTRONIC_BASES

Group

4.1 PC Correlation and heatmap (Post correction)

#### G: Highest PC to extract
highest_pc_name <- which(summary(pca_adj)$importance[3, ] > 0.85)[1]
highest_pc_name <- 20

#### A: spearman rho values
cors.r.numericVars = pca_adj %>%
  .[["x"]] %>%
  as.data.frame() %>%
  tibble::rownames_to_column("bxp_id_full") %>%
  tibble::tibble() %>%
  .[, 1:(highest_pc_name+1)] %>%
  dplyr::left_join(x=.,
                   y=metadata %>% dplyr::select(bxp_id_full, where(is.numeric)),
                   by="bxp_id_full") %>%
  dplyr::select(-any_of(c("bxp_id_full", "sample_id"))) %>%
  dplyr::mutate(across(c(where(is.character)), as.factor),
                across(c(where(is.factor)), as.numeric)) %>%
  as.matrix() %>%
  Hmisc::rcorr(., type = "spearman") %>%
  .$r %>%
  as.data.frame() %>%
  dplyr::select(matches("PC\\d+")) %>%
  # J: need to get rid of some of the meta variables from picard that being with PCT
  dplyr::select(-contains("PCT")) %>%
  tibble::rownames_to_column("meta_var") %>%
  dplyr::filter(!grepl("PC\\d+", meta_var)) %>%
  tidyr::pivot_longer(2:ncol(.)) %>%
  dplyr::mutate(name = as.factor(as.numeric(gsub("PC", "", name)))) %>%
  dplyr::group_by(meta_var) %>%
  dplyr::mutate(max.PC.cor = max(value),
                mean.PC.cor = mean(value)) %>%
  dplyr::ungroup()

#### A: P-values
cors.p.numericVars = pca_adj %>%
  .[["x"]] %>%
  as.data.frame() %>%
  tibble::rownames_to_column("bxp_id_full") %>%
  tibble::tibble() %>%
  .[, 1:(highest_pc_name+1)] %>%
  dplyr::left_join(x=.,
                   y=metadata %>% dplyr::select(bxp_id_full, where(is.numeric)),
                   by="bxp_id_full") %>%
  dplyr::select(-any_of(c("bxp_id_full", "sample_id"))) %>%
  dplyr::mutate(across(c(where(is.character)), as.factor),
                across(c(where(is.factor)), as.numeric)) %>%
  as.matrix() %>%
  Hmisc::rcorr(., type = "spearman") %>%
  .$P %>%
  as.data.frame() %>%
  dplyr::select(matches("PC\\d+")) %>%
  # J: need to get rid of some of the meta variables from picard that being with PCT
  dplyr::select(-contains("PCT")) %>%
  tibble::rownames_to_column("meta_var") %>%
  dplyr::filter(!grepl("PC\\d+", meta_var)) %>%
  tidyr::pivot_longer(2:ncol(.)) %>%
  dplyr::mutate(name = as.factor(as.numeric(gsub("PC", "", name))))

#### A: bind p and stat
cors.num = dplyr::left_join(
  x=cors.p.numericVars %>% dplyr::rename(p=value),
  y=cors.r.numericVars %>% dplyr::rename(stat=value),
  by=c("meta_var", "name")
) %>%
  dplyr::mutate(var_type = "continuous") %>%
  dplyr::group_by(meta_var) %>%
  dplyr::mutate(min.pvalue = min(p)) %>%
  dplyr::relocate(var_type, .after = last_col())

#### A: get PC/categorical correlations using K-W Chi-squared
#### G: added some regular expression to avoid issues with columns that are not
#### PCs.
cors.cat = pca_adj %>%
  .[["x"]] %>%
  as.data.frame() %>%
  tibble::rownames_to_column("bxp_id_full") %>%
  tibble::tibble() %>%
  .[, 1:(highest_pc_name+1)] %>%
  dplyr::left_join(x=.,
                   y=metadata %>% dplyr::select(bxp_id_full, !where(is.numeric), -case_id),
                   by="bxp_id_full") %>%
  dplyr::select(-any_of(c("bxp_id_full", "sample_id", "group", "Group"))) %>%
  # A: run K-W test
  run_stat_test_for_all_cols_get_res(in_df=., stat_test="kruskal") %>%
  # A: filter so var1 contains the numerics, var2 contains the categorical
  dplyr::filter(grepl("PC\\d+", var1),
                !grepl("PC\\d+", var2)) %>%
  dplyr::select(-test_run) %>%
  dplyr::rename(name=var1, meta_var=var2) %>%
  dplyr::group_by(meta_var) %>%
  dplyr::mutate(max.PC.cor = max(stat),
                mean.PC.cor = mean(stat),
                min.pvalue = min(p)) %>%
  dplyr::mutate(name = as.factor(as.numeric(gsub("PC", "", name)))) %>%
  dplyr::ungroup() %>%
  dplyr::relocate(meta_var, name) %>%
  dplyr::mutate(var_type="categorical")

#### A: generate dataframe containing covariate, PC axis, rho & P-value
#### corresponding to PC-covariate, variance explained by each PC first, binding
#### cont (spearman) and cat (K-S) data to get df containing stat assessment for
#### all putative covariates
cors.all = dplyr::bind_rows(
  cors.num,
  cors.cat
) %>%
  dplyr::mutate(name = as.factor(name)) %>%
  dplyr::mutate(
    padj = p.adjust(p, method="fdr"),
    is.sig = case_when(
      padj<0.05 ~ stat,
      TRUE ~ NA_integer_
    )) %>%
  # A: add variance explained
  dplyr::left_join(
    x=.,
    y=summary(pca) %>%
      .$importance %>%
      as.data.frame() %>%
      t() %>%
      as.data.frame() %>%
      tibble::rownames_to_column("name") %>%
      janitor::clean_names() %>%
      dplyr::mutate(name = as.factor(gsub("PC", "", name))),
    by="name"
  ) %>%
  # A: calculate weighted correlation - weights PC-covariate correlation coefficient by the % variance explained by that PC
  dplyr::mutate(variance_weighted_correlation = proportion_of_variance*abs(stat)) %>%
  # A: add adjusted P-value - adjust for number of putative covariates
  dplyr::group_by(name) %>%
  dplyr::mutate(padj = p.adjust(p, "fdr")) %>%
  dplyr::ungroup()


#### A: generate PC-covariate heatmaps

#### A: plot PC-cov correlations
pca.cor.num = cors.all %>%
  dplyr::mutate(name = as.numeric(as.character(name))) %>%
  dplyr::filter(name %in% c(1:highest_pc_name)) %>%
  dplyr::filter(var_type=="continuous") %>%
  dplyr::arrange(name) %>%
  ggplot(data=., aes(x=name, y=reorder(meta_var, max.PC.cor), fill=stat)) +
  geom_tile() +
  geom_text(aes(label = round(is.sig, 2)), color = "black", size = 1.75) +
  scale_fill_gradient2(low = "#075AFF",
                       mid = "#FFFFCC",
                       high = "#FF0000",
                       limits = c(-1, 1),
                       name = "Spearman's rho") +
  xlab("PC") +
  ylab("") +
  theme(axis.text.x = element_text(angle=0, vjust = 0.5, hjust=0.5),
        legend.position = "right") +
  scale_x_continuous(breaks=c(1:highest_pc_name)) +
  facet_wrap(~var_type, scales = "free")

pca.cor.cat = cors.all %>%
  dplyr::mutate(name = as.numeric(as.character(name))) %>%
  dplyr::filter(name %in% c(1:highest_pc_name)) %>%
  dplyr::filter(var_type=="categorical") %>%
  dplyr::arrange(name) %>%
  ggplot(data=., aes(x=name, y=reorder(meta_var, max.PC.cor), fill=stat)) +
  geom_tile() +
  geom_text(aes(label = ifelse(p<0.05, round(p, 2), "")), color = "black", size = 1.75) +
  scale_fill_gradient2(low = "#075AFF",
                       mid = "#FFFFCC",
                       high = "#FF0000",
                       name = "K-W chi-squared") +
  xlab("PC") +
  ylab("") +
  theme(axis.text.x = element_text(angle=0, vjust = 0.5, hjust=0.5),
        legend.position = "right") +
  scale_x_continuous(breaks=c(1:highest_pc_name)) +
  facet_wrap(~var_type, scales = "free")

#### A: use patchwork to get cateorical and numeric/cont heatmaps in one figure
pca.cor.fig = (pca.cor.num / pca.cor.cat)
pca.cor.fig

5 Additional results

5.1 Brain separation

Additionally, we wanted to study if there would be a brain area separation in the PC space after the covariate correction. Based on the collinearity plot shown before, we expect to see some separation power in the 16th PC.

plotPCgroup(pca_adj_out %>% dplyr::select(1:19), metadata, "brain_region", by_pairs = F)